
package edu.utah.seq.parsers;

import java.io.*;
import java.util.regex.*;
import edu.utah.seq.data.*;
import util.gen.*;
import java.util.*;

/**Parses a Bowtie alignment txt file into point data, split by chromosome and strand.
 * #Name	Ori	Chrom	StartOfRead	Sequence	BaseQualities	Nada	Mismatches
 * 0	+	gi|17981852|ref|NC_001807.4|	0	GATCACAGGTCTATCACCCTATTAACCACTCACAGGAGCTCTCGATGCAT	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	0	33:G>A,43:C>G
 * For each sequence a single hit is assigned to the center position of the read.  Files are saved using the bar format.
 * Final positions are in interbase coordinates (0 start, end excluded).
 * @author david.nix@hci.utah.edu 
 **/
public class BowtieParser {
	//fields
	private File[] dataFiles;
	private File saveDirectory;
	private File workingFile;
	private String versionedGenome;
	private int readLength;
	private HashMap <String, ArrayList<Point>> data = new HashMap <String, ArrayList<Point>> ();
	private int totalNumMatch = 0;
	private Pattern tab = Pattern.compile("\\t");
	private ArrayList<File> tempDataDirectories = new ArrayList<File>();

	//constructors
	public BowtieParser(String[] args){
		long startTime = System.currentTimeMillis();
		processArgs(args);
		System.out.println("\nConverting...");

		//for each file, parse and split by strand and chromosome and save to disk
		for (int i=0; i< dataFiles.length; i++){
			//set working objects and parse tag file name
			workingFile = dataFiles[i];
			System.out.println("\t"+workingFile);
			//split file to chromosome strand specific temp files
			if (parseWorkingFile() == false) Misc.printExit("\nError: failed to parse, aborting.\n");
			//save to disk
			writeWorkingHashMap();
			//clean up
			data = null;
		}
		
		//generate a HashMap of strandedChromosomes and files to merge
		HashMap<String,ArrayList<File>> chrFiles = collectStrandedFiles();
		
		//load, sort, make point data, and save
		System.out.print("Sorting and Saving");
		makePointData(chrFiles);
		System.out.println();
		
		//cleanup
		for (int i=0; i< tempDataDirectories.size(); i++) IO.deleteDirectory(tempDataDirectories.get(i));
		
		//stats
		System.out.println("Stats...");
		System.out.println("\t"+totalNumMatch+"\tAlignments");
		
		//finish and calc run time
		double diffTime = ((double)(System.currentTimeMillis() -startTime))/1000;
		System.out.println("\nDone! "+Math.round(diffTime)+" seconds\n");
	}
	
	public HashMap<String,ArrayList<File>> collectStrandedFiles(){
		HashMap<String,ArrayList<File>> chrFiles = new HashMap<String,ArrayList<File>>();
		//for each temp data directory get names and load into hash
		for (int i=0; i< tempDataDirectories.size(); i++){
			HashMap<String, File> x = IO.fetchNamesAndDirectories(tempDataDirectories.get(i));
			//add to merged
			Iterator<String> it = x.keySet().iterator();
			while (it.hasNext()){
				String chr = it.next();
				//fetch ArrayList
				ArrayList<File> al;
				if (chrFiles.containsKey(chr)) al = chrFiles.get(chr);
				else {
					al = new ArrayList<File>();
					chrFiles.put(chr, al);
				}
				//add File
				al.add(x.get(chr));
			}
		}
		return chrFiles;
	}
	
	/**Makes the PointData hash after sorting, writes to disk.*/
	public void makePointData(HashMap<String,ArrayList<File>> chrFiles){
		ComparatorPointPosition comp = new ComparatorPointPosition();
		int halfReadLength = (int)Math.round(((double)readLength)/2);
		//for each stranded chromosome
		Iterator<String> it = chrFiles.keySet().iterator();
		while (it.hasNext()){
			String chromName = it.next();
			//parse strand and chromosome
			int len = chromName.length();
			String strand = chromName.substring(len-1);
			String chromosome = chromName.substring(0, len-2);
			//fetchFiles, load and merge
			ArrayList<File> al = chrFiles.get(chromName);
			ArrayList<Point> pointsAL = new ArrayList<Point>();
			for (int i=0; i< al.size(); i++){
				pointsAL.addAll((ArrayList<Point>)IO.fetchObject(al.get(i)));
			}
			//convert to Point[] and sort
			Point[] points = new Point[pointsAL.size()];
			pointsAL.toArray(points);
			Arrays.sort(points, comp);			
			//make notes
			HashMap <String,String> notes = new HashMap <String,String> ();
			notes.put(BarParser.GRAPH_TYPE_TAG, BarParser.GRAPH_TYPE_BAR);
			notes.put(BarParser.SOURCE_TAG, IO.concatinateFileFullPathNames(dataFiles, ","));
			notes.put(BarParser.STRAND_TAG, strand);
			notes.put(BarParser.UNIT_TAG, "Bowtie quality score");
			notes.put(BarParser.READ_LENGTH_TAG, readLength+"");
			notes.put(BarParser.DESCRIPTION_TAG, "Generated by running the BowtieParser on Bowtie alignment =file(s), the position is assigned to the middle of the read, interbase coordinates");
			//make an Info object  public Info (String name, String versionedGenome, String chromosome, String strand, int readLength, HashMap<String,String> notes){
			Info info = new Info(chromName, versionedGenome, chromosome, strand, readLength, notes);
			//make pd
			PointData pd = Point.extractPositionScores(points);			
			pd.setInfo(info);
			//convert start positions to middle positions
			int[] pos = pd.getPositions();
			for (int i=0; i< pos.length; i++) pos[i] = pos[i]+halfReadLength;
			pd.setPositions(pos);
			//write to file
			pd.writePointData(saveDirectory);
			//cleanup
			pd = null;
			al = null;
			points = null;
			System.out.print(".");
		}
	}


	public void writeWorkingHashMap(){
		//make tempDirectory to hold stranded chromosomes
		String rnd = Passwords.createRandowWord(8);
		File dir = new File (saveDirectory, "TempDir_"+rnd);
		dir.mkdir();
		tempDataDirectories.add(dir);
		
		//iterate through hashmap saving serialized objects
		Iterator<String> it = data.keySet().iterator();
		while (it.hasNext()){
			String name = it.next();
			File file = new File (dir, name);
			IO.saveObject(file, data.get(name));
		}
	}
	
	/**Splits a tag file by chromosome and strand to separate files.*/
	public boolean parseWorkingFile(){
		try{
			//get reader
			BufferedReader in = IO.fetchBufferedReader(workingFile);
			String line;
			String[] tokens = null;
			data = new HashMap<String, ArrayList<Point>>();
			
			/*read in lines
			* #Name Ori Chrom StartOfRead Sequence BaseQualities Reserved Mismatches
			* 0 + chr3 34433460 GATCACAGGT IIIIIIIIIIIIIIII 0 33:G>A,43:C>G
			*/
			while ((line = in.readLine()) !=null){
				tokens = tab.split(line);
				//make chrStrand
				String chrStrand = tokens[2]+tokens[1];
				//fetch ArrayList
				ArrayList<Point> al;
				if (data.containsKey(chrStrand)) al = data.get(chrStrand);
				else {
					al = new ArrayList<Point>();
					data.put(chrStrand, al);
				}
				//add Point
				int startPosition = Integer.parseInt(tokens[3]);
				float score = 0;
				al.add(new Point(startPosition, score));
			}
			//set read length
			readLength = tokens[4].length();
			in.close();
			return true;
		} catch (Exception e){
			e.printStackTrace();
			return false;
		}
	}


	public static void main(String[] args) {
		if (args.length ==0){
			printDocs();
			System.exit(0);
		}
		new BowtieParser(args);
	}		

	/**This method will process each argument and assign new varibles*/
	public void processArgs(String[] args){
		Pattern pat = Pattern.compile("-[a-z]");
		File forExtraction = null;
		System.out.println("\nArguments: "+Misc.stringArrayToString(args, " ")+"\n");
		for (int i = 0; i<args.length; i++){
			String lcArg = args[i].toLowerCase();
			Matcher mat = pat.matcher(lcArg);
			if (mat.matches()){
				char test = args[i].charAt(1);
				try{
					switch (test){
					case 'f': forExtraction = new File(args[i+1]); i++; break;
					case 'v': versionedGenome = args[i+1]; i++; break;
					case 'r': saveDirectory = new File (args[i+1]); i++; break;
					case 'h': printDocs(); System.exit(0);
					default: System.out.println("\nProblem, unknown option! " + mat.group());
					}
				}
				catch (Exception e){
					Misc.printExit("\nSorry, something doesn't look right with this parameter: -"+test+"\n");
				}
			}
		}
		
		//pull files
		File[][] tot = new File[3][];
		tot[0] = IO.extractFiles(forExtraction,".txt");
		tot[1] = IO.extractFiles(forExtraction,".txt.zip");
		tot[2] = IO.extractFiles(forExtraction,".txt.gz");

		dataFiles = IO.collapseFileArray(tot);
		if (dataFiles == null || dataFiles.length==0) dataFiles = IO.extractFiles(forExtraction);
		if (dataFiles == null || dataFiles.length ==0 || dataFiles[0].canRead() == false) Misc.printExit("\nError: cannot find your xxx.txt(.zip/.gz) file(s)!\n");
		if (versionedGenome == null) Misc.printExit("\nPlease enter a genome version recognized by UCSC, see http://genome.ucsc.edu/FAQ/FAQreleases.\n");
		if (saveDirectory == null) {
			saveDirectory = new File (dataFiles[0].getParentFile(), "PointData");
			saveDirectory.mkdir();
		}
		else if (saveDirectory.exists() == false) saveDirectory.mkdir();
	}	

	public static void printDocs(){
		System.out.println("\n" +
				"**************************************************************************************\n" +
				"**                               BowtieParser: Nov 2008                             **\n" +
				"**************************************************************************************\n" +
				"Splits and converts Bowtie alignment txt files into center position alignment scored\n" +
				"binary PointData xxx.bar files. Interbase coordiantes (zero based, end excluded).\n" +
				"These can be directly viewed in IGB.\n\n" +

				"-v Versioned Genome (ie hg18, dm2, ce2, mm8), see UCSC Browser,\n"+
				"      http://genome.ucsc.edu/FAQ/FAQreleases.\n" +
				"-f The full path directory/file name of your Bowtie xxx.txt(.zip or .gz) file(s).\n" +
				"-r Full path directory name for saving the results, defaults to input directory.\n"+


				"\nExample: java -Xmx1500M -jar pathToUSeq/Apps/BowtieParser -f /Bowtie/Run7/\n" +
				"     -v H_sapiens_Mar_2006\n\n" +

				"**************************************************************************************\n");

	}	

}
